# Input:    ../data/finalData.RData
# Output:   ../output/figures/SI_uncertainty.pdf
# Author:   JB


rm(list = ls())
require(tidyverse)
require(ggridges)
require(fixest)
require(marginaleffects)

# source('./helper_functions.R')
# Load the analysis data. The code below uses `utterance_level`, which is not
# created anywhere in this script, so it is presumably restored from this
# .RData file -- TODO confirm.
load('../data/finalData.RData')

# Build `toplot`: for each keyword family (a regular expression matched against
# the lower-cased utterance text), count the Fed-chair utterances that match,
# split by `interrupted`, and express the counts as a share (`pct`) of all of
# that chair's utterances (`nTot`).
#
# Fix: the first pattern used to read "m not(sure|...)" with no space after
# "not", so phrases such as "i'm not sure" could never match.
keyword_patterns <- c(
  "(m not (sure|certain|positive|aware)|(do not|don't) know|m unsure)",
  "back to you|follow up with you|look into|(need|have) to (check|confer|look)",
  "(i am sorry|i'm sorry|i apologize|forgive me)"
)

# The Fed subset and the per-chair totals do not depend on the pattern, so they
# are computed once instead of once per loop iteration.
fed_utterances <- utterance_level %>%
  # filter(nchars < 5000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID)) %>%
  group_by(opensecretsID) %>%
  mutate(nTot = n()) %>%
  ungroup()

toplot <- NULL
for (grs in keyword_patterns) {
  matched <- fed_utterances %>%
    filter(grepl(grs, tolower(text))) %>%
    count(opensecretsID, interrupted, nTot) %>%
    mutate(pct = n / nTot,
           # Strip the 'FED' marker and title-case what is left so that it
           # lines up with the chair names used as factor levels.
           opensecretsID = factor(str_to_title(gsub('FED', '', opensecretsID)),
                                  levels = c('Greenspan', 'Bernanke', 'Yellen', 'Powell')),
           # Keep the raw pattern; it is mapped to a readable label later.
           grp = grs)

  # Newest rows go first, matching the row order of the previous version.
  toplot <- bind_rows(matched, toplot)
}

# Within each chair x keyword family, `lab` is the share of the matching
# utterances that falls in each `interrupted` category.
toplot <- toplot %>%
  group_by(opensecretsID, grp) %>%
  mutate(lab = pct / sum(pct)) %>%
  ungroup()

# Replace the raw regular expression stored in `grp` with a readable facet
# label, keyed on a word that appears in only one of the patterns.
toplot <- toplot %>%
  mutate(grp = case_when(
    grepl('sorry', grp) ~ 'Apologetic Language',
    grepl('unsure', grp) ~ 'Uncertain Language',
    TRUE ~ 'Delaying Language'
  ))

# Supplementary figure: stacked bars of the share of each Fed chair's
# utterances that contain a keyword family, one facet per family, with the
# interrupted part shaded and annotated with its share of the matches (`lab`).
#
# The plot is built before the device is opened and then print()ed explicitly.
# A ggplot object is only drawn when it is printed, and auto-printing does not
# happen when this file is run through source() with its defaults, which would
# leave an empty PDF. Building first also avoids leaving a device open if the
# plot construction fails.
interrupted_labels <- toplot %>%
  filter(interrupted == 1)

uncertainty_plot <- toplot %>%
  ggplot(aes(x = opensecretsID, y = pct, fill = factor(interrupted))) +
  geom_bar(stat = 'identity', color = 'grey70', linewidth = .2) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c('white', 'grey30'),
                    labels = c('Not interrupted', 'Interrupted')) +
  labs(x = 'Fed Chair',
       y = 'Proportion of total utterances',
       fill = 'Utterance',
       title = 'Uncertainty and Obsequiousness',
       subtitle = 'Proportions of utterances with keywords, of which interrupted in gray') +
  theme_bw() +
  # Label only the interrupted rows, placed just above y = pct.
  geom_text(data = interrupted_labels,
            aes(label = paste0(round(lab * 100, 1), '%')),
            vjust = -.2) +
  facet_wrap(~grp) +
  theme(legend.position = 'bottom',
        axis.text.x = element_text(size = 8))

pdf('../output/figures/SI_uncertainty.pdf', width = 8, height = 4)
print(uncertainty_plot)
dev.off()

# Exploratory plot: share of each Fed chair's utterances that contain a
# "let me check / find out" phrase, split by whether the utterance was
# interrupted.
#
# Fix: the subtitle previously read "I am not sure", which does not describe
# the pattern matched here. It now quotes the phrases searched for, as the
# other exploratory plots in this script do.
utterance_level %>%
  # filter(nchars < 5000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID)) %>%
  # Per-chair total, taken before the keyword filter so it can serve as the
  # denominator of `pct`.
  group_by(opensecretsID) %>%
  mutate(nTot = n()) %>%
  ungroup() %>%
  filter(grepl("check (on |into )*that|look into that|find out", tolower(text))) %>%
  # slice(1:10) %>%
  # pull(text) %>%
  # cat()
  count(opensecretsID, interrupted, nTot) %>%
  mutate(pct = n / nTot,
         opensecretsID = factor(str_to_title(gsub('FED', '', opensecretsID)),
                                levels = c('Greenspan', 'Bernanke', 'Yellen', 'Powell'))) %>%
  ggplot(aes(x = opensecretsID, y = pct, fill = factor(interrupted))) +
  geom_bar(stat = 'identity', color = 'grey70') +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c('white', 'grey30'),
                    labels = c('Not interrupted', 'Interrupted')) +
  labs(x = 'Fed Chair',
       y = 'Proportion of total utterances',
       fill = 'Utterance',
       subtitle = '"check on that|look into that|find out"') +
  theme_bw()

# Print the cleaned text of Fed-chair utterances whose raw text contains
# 'sorry'. The match is case-sensitive here: `text` is not lower-cased first.
utterance_level %>%
  filter(grepl('FED', opensecretsID),
         grepl('sorry', text)) %>%
  pull(textclean)


# Exploratory plot: share of each Fed chair's utterances that contain
# "back to you" / "follow up with you", split by interruption status.

# Fed-chair utterances, each carrying its chair's total utterance count.
followup_base <- utterance_level %>%
  # filter(nchars < 5000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID)) %>%
  group_by(opensecretsID) %>%
  mutate(nTot = n()) %>%
  ungroup()

# Matching utterances per chair and interruption status, as a share of nTot.
followup_share <- followup_base %>%
  filter(grepl("back to you|follow up with you", tolower(text))) %>%
  # slice(11) %>%
  # pull(text) %>%
  # cat()
  count(opensecretsID, interrupted, nTot) %>%
  mutate(pct = n / nTot,
         opensecretsID = factor(str_to_title(gsub('FED', '', opensecretsID)),
                                levels = c('Greenspan', 'Bernanke', 'Yellen', 'Powell')))

ggplot(followup_share,
       aes(x = opensecretsID, y = pct, fill = factor(interrupted))) +
  geom_col(color = 'grey70') +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c('white', 'grey30'),
                    labels = c('Not interrupted', 'Interrupted')) +
  labs(x = 'Fed Chair',
       y = 'Proportion of total utterances',
       fill = 'Utterance',
       subtitle = '"Back to you"') +
  theme_bw()


# Print rows 21-40 of the short (< 1000 characters) utterances whose
# opensecretsID contains 'YELLEN', each prefixed with '>' and separated by a
# blank line.
yellen_sample <- utterance_level %>%
  filter(nchars < 1000,
         grepl('YELLEN', opensecretsID)) %>%
  slice(21:40) %>%
  summarise(text = paste0('>', text, collapse = '\n\n'))

cat(yellen_sample$text)

# Exploratory plot: share of each Fed chair's utterances that contain
# "do not know" / "don't know" / "m not certain" / "m unsure", split by
# interruption status.

# Per-chair totals are attached before the keyword filter so that they can be
# used as the denominator.
unsure_base <- utterance_level %>%
  # filter(nchars < 5000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID)) %>%
  group_by(opensecretsID) %>%
  mutate(nTot = n()) %>%
  ungroup()

unsure_share <- unsure_base %>%
  filter(grepl("(do not know|don't know|m not certain|m unsure)", tolower(text))) %>%
  # slice(3) %>%
  # pull(text) %>%
  # cat()
  count(opensecretsID, interrupted, nTot) %>%
  mutate(pct = n / nTot,
         opensecretsID = factor(str_to_title(gsub('FED', '', opensecretsID)),
                                levels = c('Greenspan', 'Bernanke', 'Yellen', 'Powell')))

ggplot(unsure_share,
       aes(x = opensecretsID, y = pct, fill = factor(interrupted))) +
  geom_col(color = 'grey70') +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c('white', 'grey30'),
                    labels = c('Not interrupted', 'Interrupted')) +
  labs(x = 'Fed Chair',
       y = 'Proportion of total utterances',
       fill = 'Utterance',
       subtitle = '"do not know|not certain|unsure"') +
  theme_bw()

# Exploratory plot: share of each Fed chair's utterances that contain an
# apology ("i am sorry", "i'm sorry", "i apologize"), split by interruption
# status.

# Fed-chair utterances with the per-chair utterance total attached.
apology_base <- utterance_level %>%
  # filter(nchars < 5000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID)) %>%
  group_by(opensecretsID) %>%
  mutate(nTot = n()) %>%
  ungroup()

apology_share <- apology_base %>%
  filter(grepl("(i am sorry|i'm sorry|i apologize)", tolower(text))) %>%
  # slice(3) %>%
  # pull(text) %>%
  # cat()
  count(opensecretsID, interrupted, nTot) %>%
  mutate(pct = n / nTot,
         opensecretsID = factor(str_to_title(gsub('FED', '', opensecretsID)),
                                levels = c('Greenspan', 'Bernanke', 'Yellen', 'Powell')))

ggplot(apology_share,
       aes(x = opensecretsID, y = pct, fill = factor(interrupted))) +
  geom_col(color = 'grey70') +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c('white', 'grey30'),
                    labels = c('Not interrupted', 'Interrupted')) +
  labs(x = 'Fed Chair',
       y = 'Proportion of total utterances',
       fill = 'Utterance',
       subtitle = '"I am sorry|I apologize"') +
  theme_bw()


# Can we just calculate the tf-idf for bi-grams and tri-grams for each FED chair please?
require(tidytext)

# One row per uniqueID (opensecretsID pasted onto docID): the cleaned text of
# all Fed-chair utterances with a non-missing speaker, joined by newlines.
text <- utterance_level %>%
  filter(!is.na(speaker), grepl('FED', opensecretsID)) %>%
  transmute(uniqueID = paste0(opensecretsID, docID),
            textclean) %>%
  group_by(uniqueID) %>%
  summarise(textclean = paste(textclean, collapse = '\n')) %>%
  ungroup()

# Bigram counts per uniqueID (the `word` column holds the bigram).
fedWords <- text %>%
  # unnest_tokens(word,textclean) %>%
  # anti_join(stop_words) %>%
  # group_by(uniqueID) %>%
  # summarise(textclean = paste(word,collapse = ' ')) %>%
  unnest_tokens(word, textclean, token = 'ngrams', n = 2) %>%
  count(uniqueID, word)

# Total number of bigram tokens per uniqueID.
totWords <- fedWords %>%
  group_by(uniqueID) %>%
  summarise(total = sum(n))

# Attach the totals and the tf, idf and tf_idf columns. The join key is named
# explicitly so the join cannot silently pick up other shared columns and does
# not emit the "Joining with `by = ...`" message.
fedWords <- fedWords %>%
  left_join(totWords, by = 'uniqueID') %>%
  bind_tf_idf(word, uniqueID, n)

# Show ranks 121-180 of the bigrams with the highest tf-idf among the
# uniqueIDs that contain 'YELLEN'.
yellen_bigrams <- fedWords %>%
  filter(grepl('YELLEN', uniqueID)) %>%
  arrange(desc(tf_idf)) %>%
  slice(121:180)

print(yellen_bigrams, n = 60)

# Alternative tf-idf computation on a document-term matrix.
#
# NOTE(review): itoken(), create_vocabulary(), prune_vocabulary(),
# vocab_vectorizer(), create_dtm() and TfIdf look like text2vec functions, and
# stopwords() is presumably provided by another text package. Nothing in the
# visible part of this script attaches either, so confirm they are loaded
# before running this section.
#
# Fix: `text` only has the columns `uniqueID` and `textclean`, so the previous
# ids = paste0(text$opensecretsID, text$docID) referred to columns that do not
# exist. `uniqueID` is that same concatenation, built when `text` was created.
it <- itoken(text$textclean, ids = text$uniqueID, progressbar = FALSE)

# Vocabulary built with ngram = c(2, 4); apostrophes are stripped from the
# stop-word list before it is passed in.
v <- create_vocabulary(it, ngram = c(2, 4), stopwords = c(gsub("'", '', stopwords())))
v <- prune_vocabulary(v, doc_proportion_max = .5, term_count_min = 2)

vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer)

# Fit the tf-idf model and transform the document-term matrix with it.
model_tfidf <- TfIdf$new()
dtm_tfidf <- model_tfidf$fit_transform(dtm)


# Exploratory plot: share of each Fed chair's utterances that contain
# "i think" / "i would" / "i should", split by interruption status.

# Fed-chair utterances with the per-chair utterance total attached.
hedging_base <- utterance_level %>%
  # filter(nchars < 5000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID)) %>%
  group_by(opensecretsID) %>%
  mutate(nTot = n()) %>%
  ungroup()

hedging_share <- hedging_base %>%
  filter(grepl("(i think|i would|i should)", tolower(text))) %>%
  # slice(3) %>%
  # pull(text) %>%
  # cat()
  count(opensecretsID, interrupted, nTot) %>%
  mutate(pct = n / nTot,
         opensecretsID = factor(str_to_title(gsub('FED', '', opensecretsID)),
                                levels = c('Greenspan', 'Bernanke', 'Yellen', 'Powell')))

ggplot(hedging_share,
       aes(x = opensecretsID, y = pct, fill = factor(interrupted))) +
  geom_col(color = 'grey70') +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c('white', 'grey30'),
                    labels = c('Not interrupted', 'Interrupted')) +
  labs(x = 'Fed Chair',
       y = 'Proportion of total utterances',
       fill = 'Utterance',
       subtitle = '"i think|i would|i should"') +
  theme_bw()


# Short (< 1000 characters) Fed-chair utterances whose raw text contains
# "not sure", "check on that" or "back to you" (case-sensitive).
#
# Fix: the pipeline used to stop at cat(), so the following
# count(opensecretsID, interrupted) was evaluated as a separate statement with
# no data frame and failed with "object 'opensecretsID' not found". The
# example print and the plot are now two separate statements that share the
# same filtered data.
uncertain_short <- utterance_level %>%
  filter(nchars < 1000) %>%
  select(docID, chamber, opensecretsID, text, interrupted) %>%
  filter(grepl('FED', opensecretsID),
         grepl('not sure|check on that|back to you', text))

# Print the first matching utterance as a quick check of the pattern.
uncertain_short %>%
  slice(1) %>%
  pull(text) %>%
  cat()

# Within each chair, the proportion of matching utterances by interruption
# status (position = 'fill' rescales every bar to 1).
uncertain_short %>%
  count(opensecretsID, interrupted) %>%
  ggplot(aes(x = opensecretsID, y = n, fill = factor(interrupted))) +
  geom_bar(stat = 'identity',
           position = 'fill')

# Number of short (< 1000 characters) utterances per Fed chair whose raw text
# contains 'percent'.
percent_mentions <- utterance_level %>%
  filter(nchars < 1000) %>%
  select(docID, chamber, opensecretsID, text) %>%
  filter(grepl('FED', opensecretsID),
         grepl('percent', text))
  # head()
  # slice(1) %>%
  # pull(text) %>%
  # cat()

count(percent_mentions, opensecretsID)
